Rationale: this is basic QC for all samples of pure culture mycobiont and lichen. We will use to select which sample we canuse and comparisons
counts<-read.delim2("../analysis_and_temp_files/06_meta_mapping/kallisto_mycobiont_only.txt",sep=" ")
metadata<-read.csv2("../analysis_and_temp_files/06_meta_mapping/metadata_shared_with_neha.csv",sep=",")
length(unique(counts$sample)) #this should be 31 as in the metadat.csv file
## [1] 31
counts$tpm<-round(as.numeric(counts$tpm),digits = 3) #round off tpm upto 3 decimals
counts_tab<-pivot_wider(counts[c(1,5,6)],names_from=sample,values_from = tpm) #widen the table
all_counts<- counts_tab[rowSums(counts_tab[c(2:32)])>0,] #remove rows which have 0 in all samples
all<-cor(all_counts[c(2:32)], method="pearson",use="pairwise.complete.obs")
ord_all<-c("XBA1","XBA2","XSA1","XSA2","XSA2_2","XTA1","XTA2","XBC1","XBC2","XMC2","XSC1",
"XSC2","XTC2","XBE1","XBE2","XSE2","XTE2","MP_I","MP_II","KS48XB1","KS48XB2","KS48XB3",
"KS9XB1","KS9XB2","KS9XB3","S_21XB1","KS21XB1","S_21XB3","S_42XB1","S_42XB2","S_42XB3") #arrange order according to biological replicates (optional)
all1 <- all[, ord_all]
all1<-all1[ord_all, ]
# For correlation plot
corrplot(all1,method="color",type="full",is.corr=F, col=col2(200),addgrid.col = 0,tl.cex = 1, tl.col = "black")
# for PCA
plotMDS(all_counts[c(2:32)],main="All samples")
# For heatmap
heat1<-column_to_rownames(all_counts, var = "target_id")
exp_matrix<-data.matrix(heat1)
mycols = colorRampPalette(c("blue","white","red"))(100)
hc_rows <- hclust(as.dist(1-cor(t(exp_matrix), method="pearson")), method="average") # for clustering genes
hc_cols <- hclust(as.dist(1 - cor(exp_matrix, method = "pearson")), method = "average") # to arrange the sample names at x-axis according to hierarchical clustering
htmap<-heatmap.2(exp_matrix, Colv=as.dendrogram(hc_cols),Rowv=as.dendrogram(hc_rows),dendrogram = "both",trace = "none",
col = mycols, keysize = 1, key.title = "Title",
scale = c("row"), cexRow = 0.75, cexCol = 1, srtCol = NULL,
srtRow = NULL, adjRow = c(0,NA), sepcolor="white" ,margins = c(7,15),
lwid = c(2,8),
lhei= c(2,8), main ="All samples", na.color="black") #this is the final one for the heat map
lich_counts<- counts_tab[rowSums(counts_tab[c(2:14,16:19)])>0,] #remove rows which have 0 in all samples
lich<-cor(lich_counts[c(2:14,16:19)], method="pearson",use="pairwise.complete.obs")
lich<-cor(counts_tab[c(2:14,16:19)], method="pearson",use="pairwise.complete.obs")
ord_lich<-c("XBA1","XBA2","XSA1","XSA2","XSA2_2","XTA1","XTA2","XBC1","XBC2","XMC2","XSC1",
"XSC2","XTC2","XBE1","XBE2","XSE2","XTE2")
lich1 <- lich[, ord_lich]
lich1<-lich1[ord_lich, ]
# For correlation plot
corrplot(lich1,method="color",type="full",
main="", is.corr=F, col=col2(200),addgrid.col = 0,tl.cex = 1, tl.col = "black")
# For multidimensional scaling
plotMDS(lich_counts[c(2:14,16:19)],main="Lichen samples")
# For heatmap
heat<-lich_counts[c(1:14,16:19)]
heat1<-column_to_rownames(heat, var = "target_id")
exp_matrix<-data.matrix(heat1)
mycols = colorRampPalette(c("blue","white","red"))(100)
hc_rows <- hclust(as.dist(1-cor(t(exp_matrix), method="pearson")), method="average") # for clustering genes
hc_cols <- hclust(as.dist(1 - cor(exp_matrix, method = "pearson")), method = "average") # to arrange the sample names at x-axis according to hierarchical clustering
htmap<-heatmap.2(exp_matrix, Colv=as.dendrogram(hc_cols),Rowv=as.dendrogram(hc_rows),dendrogram = "both",trace = "none",
col = mycols, keysize = 1, key.title = "Title",
scale = c("row"), cexRow = 0.75, cexCol = 1, srtCol = NULL,
srtRow = NULL, adjRow = c(0,NA), sepcolor="white" ,margins = c(7,15),
lwid = c(2,8),
lhei= c(2,8), main ="lich samples", na.color="black") #this is the final one for the heat map
xan_counts<- counts_tab[rowSums(counts_tab[c(21:32)])>0,] #remove rows which have 0 in xan samples
xan<-cor(xan_counts[c(21:32)], method="pearson",use="pairwise.complete.obs")
ord_xan<-c("KS48XB1","KS48XB2","KS48XB3",
"KS9XB1","KS9XB2","KS9XB3","S_21XB1","KS21XB1","S_21XB3","S_42XB1","S_42XB2","S_42XB3") #arrange order according to biological replicates (optional)
xan1 <- xan[, ord_xan]
xan1<-xan1[ord_xan, ]
corrplot(xan1,method="color",type="full",is.corr=F, col=col2(200),addgrid.col = 0,tl.cex = 1, tl.col = "black")
# For multidimensional scaling
plotMDS(xan_counts[c(21:32)],main="Xanthoria samples")
# For heatmap
heat<-xan_counts[c(1,21:32)]
heat1<-column_to_rownames(heat, var = "target_id")
exp_matrix<-data.matrix(heat1)
mycols = colorRampPalette(c("blue","white","red"))(100)
hc_rows <- hclust(as.dist(1-cor(t(exp_matrix), method="pearson")), method="average") # for clustering genes
hc_cols <- hclust(as.dist(1 - cor(exp_matrix, method = "pearson")), method = "average") # to arrange the sample names at x-axis according to hierarchical clustering
htmap<-heatmap.2(exp_matrix, Colv=as.dendrogram(hc_cols),Rowv=as.dendrogram(hc_rows),dendrogram = "both",trace = "none",
col = mycols, keysize = 1, key.title = "Title",
scale = c("row"), cexRow = 0.75, cexCol = 1, srtCol = NULL,
srtRow = NULL, adjRow = c(0,NA), sepcolor="white" ,margins = c(7,15),
lwid = c(2,8),
lhei= c(2,8), main ="Xanthoria samples", na.color="black") #this is the final one for the heat map
mkdir analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/
source package /tsl/software/testing/bin/trinity-2.14.0
Trinity --seqType fq --max_memory 40G --left ../03_transcriptomic_analysis/analysis_and_temp_files/03_qc/trimmed_reads/KS48XB1_trimmed.bbmapmerged.non_rRNA.1.fq.gz --right ../03_transcriptomic_analysis/analysis_and_temp_files/03_qc/trimmed_reads/KS48XB1_trimmed.bbmapmerged.non_rRNA.2.fq.gz --CPU 20 --output analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/KS48XB1_trinity
Trinity --seqType fq --max_memory 40G --left ../03_transcriptomic_analysis/analysis_and_temp_files/03_qc/trimmed_reads/XBE1_trimmed.bbmapmerged.non_rRNA.1.fq.gz --right ../03_transcriptomic_analysis/analysis_and_temp_files/03_qc/trimmed_reads/XBE1_trimmed.bbmapmerged.non_rRNA.2.fq.gz --CPU 20 --output analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/XBE1_trinity
#align with BLAT
source package /tgac/software/testing/bin/blat-37x1
blat analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/XBE1_trinity.Trinity.fasta analysis_and_temp_files/06_annotate_lecanoro/GTX0501_pred/annotate_results/Xanthoria_parietina_GTX0501.mrna-transcripts.fa -out=psl -noHead analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/XBE1_blat.psl
#get list of transcripts that map by selecting the column with cut (48780 lines)
cut -f14 analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/XBE1_blat.psl > analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/XBE1_transcripts_headers_mapped_to_GTX501.txt
#remove duplicates with awk (21219 lines)
awk -i inplace '!seen[$0]++' analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/XBE1_transcripts_headers_mapped_to_GTX501.txt
#extract fastas with seqtk v1.4-r130
source package d6c2932f-b01d-4039-8736-a2e15d4867e2
seqtk subseq analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/XBE1_trinity.Trinity.fasta analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/XBE1_transcripts_headers_mapped_to_GTX501.txt > analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/XBE1_transcripts_headers_mapped_to_GTX501.fasta
fastANI -r analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/XBE1_transcripts_headers_mapped_to_GTX501.fasta -q analysis_and_temp_files/06_annotate_lecanoro/GTX0501_pred/annotate_results/Xanthoria_parietina_GTX0501.mrna-transcripts.fa -o analysis_and_temp_files/06_annotate_lecanoro/GTX0501_XBE1_ani.txt
#align with BLAT
source package /tgac/software/testing/bin/blat-37x1
blat analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/KS48XB1_trinity.Trinity.fasta analysis_and_temp_files/06_annotate_lecanoro/GTX0501_pred/annotate_results/Xanthoria_parietina_GTX0501.mrna-transcripts.fa -out=psl analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/KS48XB1_blat.psl
#get list of transcripts that map by selecting the column with cut (48780 lines)
cut -f14 analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/KS48XB1_blat.psl > analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/KS48XB1_transcripts_headers_mapped_to_GTX501.txt
#remove duplicates with awk (21219 lines)
awk -i inplace '!seen[$0]++' analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/KS48XB1_transcripts_headers_mapped_to_GTX501.txt
#extract fastas with seqtk v1.4-r130
source package d6c2932f-b01d-4039-8736-a2e15d4867e2
seqtk subseq analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/KS48XB1_trinity.Trinity.fasta analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/KS48XB1_transcripts_headers_mapped_to_GTX501.txt > analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/KS48XB1_transcripts_headers_mapped_to_GTX501.fasta
fastANI -r analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/KS48XB1_transcripts_headers_mapped_to_GTX501.fasta -q analysis_and_temp_files/06_annotate_lecanoro/GTX0501_pred/annotate_results/Xanthoria_parietina_GTX0501.mrna-transcripts.fa -o analysis_and_temp_files/06_annotate_lecanoro/GTX0501_KS48XB1_filtered_ani.txt
fastANI -r analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/KS48XB1_transcripts_headers_mapped_to_GTX501.fasta -q analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/XBE1_transcripts_headers_mapped_to_GTX501.fasta -o analysis_and_temp_files/06_annotate_lecanoro/XBE1_KS48XB1_filtered_ani.txt